library(ggplot2)
library(tidyverse)
library(lubridate)
library(stringr)
library(forcats)

Chicago taxi data

https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/data

Data description: This is data from taxi trips reported to the city of Chicago. There are total 112860054 rows and 23 variables including information about the taxi, time, location and expenses.

Time estimation: download for 100 seconds, 57.1MB, 194455 rows. So the time takes to download whole dataset should be about 16 hours. Size of whole data set should be about 33GB.

df = read.csv("Taxi_Trips.csv")
df

Analysis

6. After you have data, Make table with day of the week in the column name, Length of a trip in vertical (you will have to cut it, to let say 5 intervals), and average tip in cells

6a. Be carefull - if you apply cut blindly, using intervals, outliers (if present) can make this cutting useless. You need to see how many records there are in every group.

  • Clean variable “Tips”
df$Tips <- str_replace_all(df$Tips, "\\$", "") %>%
  as.numeric()
  • Get three variables
table_tips <- df %>% 
  select(Trip.Start.Timestamp, Trip.End.Timestamp, Tips) %>%
  mutate(Start_date = mdy_hms(Trip.Start.Timestamp))  %>%
  mutate(wkday = wday(Start_date, label = TRUE)) %>%
  separate(Start_date, c("st_date","st_time"), sep = " ") %>%
  separate(st_time, c("st_hr","st_min","st_sc"), sep = ":") %>%
  mutate(End_date = mdy_hms(Trip.End.Timestamp))  %>%
  separate(End_date, c("ed_date","ed_time"), sep = " ") %>%
  separate(ed_time, c("ed_hr","ed_min","ed_sc"), sep = ":") %>%
  mutate(hr = as.numeric(ed_hr) - as.numeric(st_hr)) %>%
  mutate(min = as.numeric(ed_min) - as.numeric(st_min)) %>%
  filter(hr>0) %>%
  mutate(Duration = hr*60 + min) %>%
  select(wkday,Duration,Tips) 
unknown timezone 'default/America/New_York'
  table_tips 
  • Select intervals so that there’s enough data in each cell.
table_tips_n_ob <- table_tips %>%
  group_by(wkday, Duration_interval = cut(Duration,breaks = c(0,15,30,45,60,1500)))%>%
  summarise(ob = n()) %>%
  spread(wkday,ob)
table_tips_n_ob
table_tips_n_ob_group <- table_tips %>%
  group_by(Duration_interval = cut(Duration,breaks = c(0,15,30,45,60,1500)))%>%
  summarise(ob = n())
table_tips_n_ob_group
  • Average tip for each day and duration of trip
table_tips_wkday <- table_tips %>%
  group_by(wkday, Duration_interval = cut(Duration, breaks = c(0,15,30,45,60,1500))) %>%
  summarise(avg_tip = mean(Tips, na.rm = T)) %>%
  spread(wkday,avg_tip)%>%
mutate(Duration_interval = fct_recode(Duration_interval, `15 min` = "(0,15]", 
                                  `30 min` = "(15,30]",
                                  `45 min` = "(30,45]",
                                  `60 min` = "(45,60]",
                                  `Over an hour` = "(60,1.5e+03]"
))
table_tips_wkday

6b. Combine length of trip with number of recods in parentecies - to use as first column

table_tips_n_ob_group$ob<- paste0("(",format(table_tips_n_ob_group$ob),")")
table_tips_wkday %>%
  mutate(n_obsevations = table_tips_n_ob_group$ob) %>%
  unite(`Duration(n of observations)`, Duration_interval, n_obsevations, sep = " ")

Plot this data. Make plot with separation by morning-day-evening-night. Best to drive

  • Get time of week, time of day and corresponding tips.
table_time <- df %>% 
  select(Trip.Start.Timestamp, Tips) %>%
  mutate(Start_date = mdy_hms(Trip.Start.Timestamp))  %>%
  mutate(wkday = wday(Start_date, label = TRUE)) %>%
  separate(Start_date, c("st_date","st_time"), sep = " ") %>%
  separate(st_time, c("st_hr","st_min","st_sc"), sep = ":") %>%
  mutate(start_hr = as.numeric(st_hr)) %>%
  select(Tips, start_hr, wkday) %>%
  group_by(wkday, time_of_day = cut(start_hr, breaks = c(0,6,9,16,20,24))) %>%
  filter(is.na(time_of_day) == F) %>%
  summarise(avg_tip = mean(Tips, na.rm = T)) 
table_time
table_time_of_day <- table_time %>%
  mutate(time_of_day = fct_recode(time_of_day, Before_dawn = "(0,6]", 
                                  Morning = "(6,9]",
                                  Day = "(9,16]",
                                  Evening = "(16,20]",
                                  Night = "(20,24]"
))
ggplot(table_time_of_day, aes(x = time_of_day, y = wkday)) +
  geom_tile(mapping = aes(fill = avg_tip))

According to the graph, the tips are generally higher during Saturday or during the morning.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkobHVicmlkYXRlKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoZm9yY2F0cykKYGBgCgojIyBDaGljYWdvIHRheGkgZGF0YQpodHRwczovL2RhdGEuY2l0eW9mY2hpY2Fnby5vcmcvVHJhbnNwb3J0YXRpb24vVGF4aS1Ucmlwcy93cnZ6LXBzZXcvZGF0YSAKCiMjIyBEYXRhIGRlc2NyaXB0aW9uOiBUaGlzIGlzIGRhdGEgZnJvbSB0YXhpIHRyaXBzIHJlcG9ydGVkIHRvIHRoZSBjaXR5IG9mIENoaWNhZ28uIFRoZXJlIGFyZSB0b3RhbCAxMTI4NjAwNTQgcm93cyBhbmQgMjMgdmFyaWFibGVzIGluY2x1ZGluZyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgdGF4aSwgdGltZSwgbG9jYXRpb24gYW5kIGV4cGVuc2VzLgoKIyMjIFRpbWUgZXN0aW1hdGlvbjogZG93bmxvYWQgZm9yIDEwMCBzZWNvbmRzLCA1Ny4xTUIsIDE5NDQ1NSByb3dzLiBTbyB0aGUgdGltZSB0YWtlcyB0byBkb3dubG9hZCB3aG9sZSBkYXRhc2V0IHNob3VsZCBiZSBhYm91dCAxNiBob3Vycy4gU2l6ZSBvZiB3aG9sZSBkYXRhIHNldCBzaG91bGQgYmUgYWJvdXQgMzNHQi4KCmBgYHtyfQpkZiA9IHJlYWQuY3N2KCJUYXhpX1RyaXBzLmNzdiIpCmRmCmBgYAoKIyMgQW5hbHlzaXMKIyMjIDYuIEFmdGVyIHlvdSBoYXZlIGRhdGEsIE1ha2UgdGFibGUgd2l0aCBkYXkgb2YgdGhlIHdlZWsgaW4gdGhlIGNvbHVtbiBuYW1lLCBMZW5ndGggb2YgYSB0cmlwIGluIHZlcnRpY2FsICh5b3Ugd2lsbCBoYXZlIHRvIGN1dCBpdCwgdG8gbGV0IHNheSA1IGludGVydmFscyksIGFuZCBhdmVyYWdlIHRpcCBpbiBjZWxscyAKCiMjIzZhLiBCZSBjYXJlZnVsbCAtIGlmIHlvdSBhcHBseSBjdXQgYmxpbmRseSwgdXNpbmcgaW50ZXJ2YWxzLCBvdXRsaWVycyAoaWYgcHJlc2VudCkgY2FuIG1ha2UgdGhpcyBjdXR0aW5nIHVzZWxlc3MuIFlvdSBuZWVkIHRvIHNlZSBob3cgbWFueSByZWNvcmRzIHRoZXJlIGFyZSBpbiBldmVyeSBncm91cC4KCi0gQ2xlYW4gdmFyaWFibGUgIlRpcHMiCmBgYHtyfQpkZiRUaXBzIDwtIHN0cl9yZXBsYWNlX2FsbChkZiRUaXBzLCAiXFwkIiwgIiIpICU+JQogIGFzLm51bWVyaWMoKQpgYGAKCi0gR2V0IHRocmVlIHZhcmlhYmxlcwpgYGB7cn0KdGFibGVfdGlwcyA8LSBkZiAlPiUgCiAgc2VsZWN0KFRyaXAuU3RhcnQuVGltZXN0YW1wLCBUcmlwLkVuZC5UaW1lc3RhbXAsIFRpcHMpICU+JQogIG11dGF0ZShTdGFydF9kYXRlID0gbWR5X2htcyhUcmlwLlN0YXJ0LlRpbWVzdGFtcCkpICAlPiUKICBtdXRhdGUod2tkYXkgPSB3ZGF5KFN0YXJ0X2RhdGUsIGxhYmVsID0gVFJVRSkpICU+JQogIHNlcGFyYXRlKFN0YXJ0X2RhdGUsIGMoInN0X2RhdGUiLCJzdF90aW1lIiksIHNlcCA9ICIgIikgJT4lCiAgc2VwYXJhdGUoc3RfdGltZSwgYygic3RfaHIiLCJzdF9taW4iLCJzdF9zYyIpLCBzZXAgPSAiOiIpICU+JQogIG11dGF0ZShFbmRfZGF0ZSA9IG1keV9obXMoVHJpcC5FbmQuVGltZXN0YW1wKSkgICU+JQogIHNlcGFyYXRlKEVuZF9kYXRlLCBjKCJlZF9kYXRlIiwiZWRfdGltZSIpLCBzZXAgPSAiICIpICU+JQogIHNlcGFyYXRlKGVkX3RpbWUsIGMoImVkX2hyIiwiZWRfbWluIiwiZWRfc2MiKSwgc2VwID0gIjoiKSAlPiUKICBtdXRhdGUoaHIgPSBhcy5udW1lcmljKGVkX2hyKSAtIGFzLm51bWVyaWMoc3RfaHIpKSAlPiUKICBtdXRhdGUobWluID0gYXMubnVtZXJpYyhlZF9taW4pIC0gYXMubnVtZXJpYyhzdF9taW4pKSAlPiUKICBmaWx0ZXIoaHI+MCkgJT4lCiAgbXV0YXRlKER1cmF0aW9uID0gaHIqNjAgKyBtaW4pICU+JQogIHNlbGVjdCh3a2RheSxEdXJhdGlvbixUaXBzKSAKICB0YWJsZV90aXBzIApgYGAKCi0gU2VsZWN0IGludGVydmFscyBzbyB0aGF0IHRoZXJlJ3MgZW5vdWdoIGRhdGEgaW4gZWFjaCBjZWxsLgpgYGB7cn0KdGFibGVfdGlwc19uX29iIDwtIHRhYmxlX3RpcHMgJT4lCiAgZ3JvdXBfYnkod2tkYXksIER1cmF0aW9uX2ludGVydmFsID0gY3V0KER1cmF0aW9uLGJyZWFrcyA9IGMoMCwxNSwzMCw0NSw2MCwxNTAwKSkpJT4lCiAgc3VtbWFyaXNlKG9iID0gbigpKSAlPiUKICBzcHJlYWQod2tkYXksb2IpCnRhYmxlX3RpcHNfbl9vYgoKdGFibGVfdGlwc19uX29iX2dyb3VwIDwtIHRhYmxlX3RpcHMgJT4lCiAgZ3JvdXBfYnkoRHVyYXRpb25faW50ZXJ2YWwgPSBjdXQoRHVyYXRpb24sYnJlYWtzID0gYygwLDE1LDMwLDQ1LDYwLDE1MDApKSklPiUKICBzdW1tYXJpc2Uob2IgPSBuKCkpCnRhYmxlX3RpcHNfbl9vYl9ncm91cApgYGAKCi0gQXZlcmFnZSB0aXAgZm9yIGVhY2ggZGF5IGFuZCBkdXJhdGlvbiBvZiB0cmlwCmBgYHtyfQp0YWJsZV90aXBzX3drZGF5IDwtIHRhYmxlX3RpcHMgJT4lCiAgZ3JvdXBfYnkod2tkYXksIER1cmF0aW9uX2ludGVydmFsID0gY3V0KER1cmF0aW9uLCBicmVha3MgPSBjKDAsMTUsMzAsNDUsNjAsMTUwMCkpKSAlPiUKICBzdW1tYXJpc2UoYXZnX3RpcCA9IG1lYW4oVGlwcywgbmEucm0gPSBUKSkgJT4lCiAgc3ByZWFkKHdrZGF5LGF2Z190aXApJT4lCm11dGF0ZShEdXJhdGlvbl9pbnRlcnZhbCA9IGZjdF9yZWNvZGUoRHVyYXRpb25faW50ZXJ2YWwsIGAxNSBtaW5gID0gIigwLDE1XSIsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYDMwIG1pbmAgPSAiKDE1LDMwXSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBgNDUgbWluYCA9ICIoMzAsNDVdIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGA2MCBtaW5gID0gIig0NSw2MF0iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYE92ZXIgYW4gaG91cmAgPSAiKDYwLDEuNWUrMDNdIgopKQp0YWJsZV90aXBzX3drZGF5CmBgYAoKCgojIyM2Yi4gQ29tYmluZSBsZW5ndGggb2YgdHJpcCB3aXRoIG51bWJlciBvZiByZWNvZHMgaW4gcGFyZW50ZWNpZXMgLSB0byB1c2UgYXMgZmlyc3QgY29sdW1uIAoKYGBge3J9CnRhYmxlX3RpcHNfbl9vYl9ncm91cCRvYjwtIHBhc3RlMCgiKCIsZm9ybWF0KHRhYmxlX3RpcHNfbl9vYl9ncm91cCRvYiksIikiKQp0YWJsZV90aXBzX3drZGF5ICU+JQogIG11dGF0ZShuX29ic2V2YXRpb25zID0gdGFibGVfdGlwc19uX29iX2dyb3VwJG9iKSAlPiUKICB1bml0ZShgRHVyYXRpb24obiBvZiBvYnNlcnZhdGlvbnMpYCwgRHVyYXRpb25faW50ZXJ2YWwsIG5fb2JzZXZhdGlvbnMsIHNlcCA9ICIgIikKYGBgCgoKIyMjUGxvdCB0aGlzIGRhdGEuIE1ha2UgcGxvdCB3aXRoIHNlcGFyYXRpb24gYnkgbW9ybmluZy1kYXktZXZlbmluZy1uaWdodC4gQmVzdCB0byBkcml2ZSAKCi0gR2V0IHRpbWUgb2Ygd2VlaywgdGltZSBvZiBkYXkgYW5kIGNvcnJlc3BvbmRpbmcgdGlwcy4KYGBge3J9CnRhYmxlX3RpbWUgPC0gZGYgJT4lIAogIHNlbGVjdChUcmlwLlN0YXJ0LlRpbWVzdGFtcCwgVGlwcykgJT4lCiAgbXV0YXRlKFN0YXJ0X2RhdGUgPSBtZHlfaG1zKFRyaXAuU3RhcnQuVGltZXN0YW1wKSkgICU+JQogIG11dGF0ZSh3a2RheSA9IHdkYXkoU3RhcnRfZGF0ZSwgbGFiZWwgPSBUUlVFKSkgJT4lCiAgc2VwYXJhdGUoU3RhcnRfZGF0ZSwgYygic3RfZGF0ZSIsInN0X3RpbWUiKSwgc2VwID0gIiAiKSAlPiUKICBzZXBhcmF0ZShzdF90aW1lLCBjKCJzdF9ociIsInN0X21pbiIsInN0X3NjIiksIHNlcCA9ICI6IikgJT4lCiAgbXV0YXRlKHN0YXJ0X2hyID0gYXMubnVtZXJpYyhzdF9ocikpICU+JQogIHNlbGVjdChUaXBzLCBzdGFydF9ociwgd2tkYXkpICU+JQogIGdyb3VwX2J5KHdrZGF5LCB0aW1lX29mX2RheSA9IGN1dChzdGFydF9ociwgYnJlYWtzID0gYygwLDYsOSwxNiwyMCwyNCkpKSAlPiUKICBmaWx0ZXIoaXMubmEodGltZV9vZl9kYXkpID09IEYpICU+JQogIHN1bW1hcmlzZShhdmdfdGlwID0gbWVhbihUaXBzLCBuYS5ybSA9IFQpKSAKdGFibGVfdGltZQpgYGAKYGBge3J9CnRhYmxlX3RpbWVfb2ZfZGF5IDwtIHRhYmxlX3RpbWUgJT4lCiAgbXV0YXRlKHRpbWVfb2ZfZGF5ID0gZmN0X3JlY29kZSh0aW1lX29mX2RheSwgQmVmb3JlX2Rhd24gPSAiKDAsNl0iLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE1vcm5pbmcgPSAiKDYsOV0iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRGF5ID0gIig5LDE2XSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBFdmVuaW5nID0gIigxNiwyMF0iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTmlnaHQgPSAiKDIwLDI0XSIKKSkKCmdncGxvdCh0YWJsZV90aW1lX29mX2RheSwgYWVzKHggPSB0aW1lX29mX2RheSwgeSA9IHdrZGF5KSkgKwogIGdlb21fdGlsZShtYXBwaW5nID0gYWVzKGZpbGwgPSBhdmdfdGlwKSkKYGBgCkFjY29yZGluZyB0byB0aGUgZ3JhcGgsIHRoZSB0aXBzIGFyZSBnZW5lcmFsbHkgaGlnaGVyIGR1cmluZyBTYXR1cmRheSBvciBkdXJpbmcgdGhlIG1vcm5pbmcuCg==